Saccharomyces cerevisiae (sacCer1) annotations
==============================================
The following files were modified after being downloaded from the UCSC database so that their columns match the annotation files used for other organisms:

genes.txt = http://hgdownload.cse.ucsc.edu/goldenPath/sacCer1/database/ensGene.txt.gz       updated        19-Apr-2008 03:11  150K  

cytoband.txt = http://hgdownload.cse.ucsc.edu/goldenPath/sacCer1/database/chromInfo.txt.gz             14-Jan-2004 17:03  218  

annotations.txt is obtained by running the script below

# Build the sacCer1 annotation files. Run from a directory that contains
# ./bin/gene2xml.exe and the helper scripts under ./perl/.

# Create the output directory tree <organism>/<assembly>.
mkdir Saccharomyces_cerevisiae
mkdir Saccharomyces_cerevisiae/sacCer1
echo "Started creating annotations.txt for Yeast sacCer1" `date`
# cytoband.txt: download chromInfo.txt.gz, keep only lines starting with "chr",
# and emit five tab-separated columns per line: column 1, 0, column 2
# (presumably the chromosome size -- confirm), an empty field and the literal "gneg".
# The sort key starts at the 4th character of the line (i.e. after "chr") and is numeric.
perl -MLWP::Simple -e 'getprint "http://hgdownload.cse.ucsc.edu/goldenPath/sacCer1/database/chromInfo.txt.gz"' | gzip -cd | perl -aF/\\t/ -ne 'chomp; /^chr/ or next; print join("\t",$F[0],0,$F[1],"","gneg"),"\n"' | sort -k1.4 -n > Saccharomyces_cerevisiae/sacCer1/cytoband.txt
#perl -MLWP::Simple -e 'getprint "ftp://ftp.sanger.ac.uk/pub/mirbase/sequences/CURRENT/genomes/cfa.gff"' > Saccharomyces_cerevisiae/sacCer1/mirnas.txt
echo "Started conversion of Saccharomyces_cerevisiae.ags.gz into tab delimited text file" `date`
# Download the NCBI Entrez Gene .ags.gz file, decompress it, run it through
# gene2xml (-x -b), strip carriage returns (\cM) and feed the result to the
# conversion script, whose STDOUT becomes the all_annotations table.
# NOTE(review): convert2_entrez_annotations.pl is presumably the Perl script
# listed further down, which also writes entrez_genes.txt into the current
# directory -- confirm, since its usage comment names a different file.
perl -MLWP::Simple -e 'getprint "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/ASN_BINARY/Fungi/Saccharomyces_cerevisiae.ags.gz"' | gzip -cd | ./bin/gene2xml.exe -x -b | perl -pe 's/\cM//g' | perl ./perl/convert2_entrez_annotations.pl >  Saccharomyces_cerevisiae.all_annotations.txt
echo "Completed conversion of Saccharomyces_cerevisiae.ags.gz" `date`
# genes.txt: download ensGene.txt.gz and drop the first tab-separated column of
# every row. The lines are not chomped, so the last field keeps its newline.
perl -MLWP::Simple -e 'getprint "http://hgdownload.cse.ucsc.edu/goldenPath/sacCer1/database/ensGene.txt.gz"' | gzip -cd | perl -aF/\\t/ -ne 'shift @F; print join("\t",@F)' > Saccharomyces_cerevisiae/sacCer1/genes.txt
# Copy entrez_genes.txt from the current directory into the assembly directory.
cp entrez_genes.txt Saccharomyces_cerevisiae/sacCer1
# Run create_annotations.txt.files.pl once per gene table: the all_annotations
# file is passed as the argument, the gene table arrives on STDIN and the result
# is captured from STDOUT. Presumably this keeps only the annotations for genes
# present in the gene table -- confirm against that script.
perl ./perl/create_annotations.txt.files.pl Saccharomyces_cerevisiae.all_annotations.txt < Saccharomyces_cerevisiae/sacCer1/genes.txt > Saccharomyces_cerevisiae/sacCer1/annotations.txt
perl ./perl/create_annotations.txt.files.pl Saccharomyces_cerevisiae.all_annotations.txt < Saccharomyces_cerevisiae/sacCer1/entrez_genes.txt > Saccharomyces_cerevisiae/sacCer1/entrez_annotations.txt
# NOTE(review): this runs after entrez_annotations.txt has already been built
# from entrez_genes.txt; whether the script rewrites the file in place is not
# visible here -- confirm the intended order.
perl perl/convert_Romannumerals.pl Saccharomyces_cerevisiae/sacCer1/entrez_genes.txt
echo "All Saccharomyces_cerevisiae annotations done" `date`

#!/bin/perl
# First download an ags file from ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/ASN_BINARY/../.ags.gz
#Usage : gzip -cd Homo_sapiens.ags.gz | ./gene2xml.exe -x -b |  perl -pe 's/\cM//g' | perl get_entrez_annotations.pl > all_annotations.txt
# gene2xml (from ftp://ftp.ncbi.nlm.nih.gov/toolbox/ncbi_tools/converters/by_program/) converts to ASN.1 formatted text from binary using -x and -b flags and s/\cM/ removes microsoft line endings
# This creates entrez_genes.txt (gene coordinates) in the current directory and prints an annotations table to STDOUT; the table can be pruned later to include only the gene symbols in the genes.txt file.
use strict;
use Bio::ASN1::EntrezGene; # parser installed using perl -MCPAN -e 'install M/MI/MINGYILIU/Bio-ASN1-EntrezGene-1.091.tgz'
# ---------------------------------------------------------------------------
# Setup and main parsing loop.
#
# Reads Entrez Gene records from STDIN through Bio::ASN1::EntrezGene and
#   * writes one coordinate line per gene product to entrez_genes.txt, and
#   * collects per-gene annotations in %annos (printed after the loop).
#
# An optional first command-line argument restricts processing to records
# whose taxon id equals that value.
# ---------------------------------------------------------------------------
my @ontos = qw(Process Component Function); # GO category labels, in the column order used by $hdrs
my $parser = Bio::ASN1::EntrezGene->new('fh' => *STDIN); # uses STDIN as filehandle so we can pipe in gene2xml output
my (%annos, %all_dbids); # %annos: symbol => annotation row; %all_dbids: every external db name seen
my $taxon = shift; # A taxon id can be passed in if the file contains multiple organisms

# Three-argument open on a lexical handle; include the OS error in the message.
open my $gene_fh, '>', 'entrez_genes.txt' or die "Cannot create entrez_genes.txt: $!\n";

# Fixed leading columns of the annotation table (tab separated).
my $hdrs = join "\t", 'Symbol', 'Name', 'Description', 'Biological process', 'Cellular component', 'Molecular function', 'LocusLink ID', 'Other Aliases';

while (my $result = $parser->next_seq) # Goes thru each gene for its annotations
{
	my $seq = $result->[0]; # ref to the annotation
	my $taxonid = $seq->{source}->[0]->{org}->[0]->{db}->[0]->{tag}->[0]->{id};
	$taxon && $taxon ne $taxonid and next; # skip other organisms when a taxon filter was given

	my $gene = $seq->{gene}->[0]; # ref to the gene info
	my $geneid = $seq->{'track-info'}->[0]->{geneid}; # gene id

	# Symbol fallback chain: locus, locus-tag, first synonym, "LOC<geneid>", empty string.
	my $sym = $gene->{locus} || $gene->{'locus-tag'}; # official symbol
	$sym ||= ref $gene->{syn} eq "ARRAY" && $gene->{syn}->[0] || $geneid && "LOC".$geneid || "";

	my $status = $seq->{'track-info'}->[0]->{status}; # annotation status such as live/discontinued
	$status =~ /discontinued/ and next;

	my $chromosome = "chr".$seq->{source}->[0]->{subtype}->[0]->{name};
	my $name = $gene->{desc}; # verbose name
	my $desc = $seq->{summary}; # good detailed description

	# External database cross references: db name => id (or str when there is no id).
	# The index-based map is kept on purpose: it tolerates a record without a db list.
	my $db_ref = $gene->{db}; # ext database ids
	my %db_ids = map {($db_ref->[$_]->{db}, $db_ref->[$_]->{tag}->[0]->{id} || $db_ref->[$_]->{tag}->[0]->{str})} 0..$#$db_ref;
	$all_dbids{$_}++ for keys %db_ids;

	# One line in entrez_genes.txt per product that carries genomic coordinates.
	if ($seq->{locus})
	{
		foreach my $locus (@{$seq->{locus}})
		{
#			$locus->{accession} =~ /^NC_/ or next; # disabled accession filter
#			$locus->{accession} or next;
			my $strand = $locus->{seqs}->[0]->{int}->[0]->{strand};
			$strand = $strand =~ /minus/ ? "-" : "+"; # anything that is not "minus" is reported as "+"
			if ($locus->{products})
			{
				foreach my $product (@{$locus->{products}})
				{
#					$product->{accession} =~ /^NM_/ or next; # disabled accession filter
#					$product->{accession} or next;
					# Interval list may live under mix/int, packed-int or int; skip products without one.
					my $exon_refe = $product->{'genomic-coords'}->[0]->{mix}->[0]->{int} || $product->{'genomic-coords'}->[0]->{'packed-int'} || $product->{'genomic-coords'}->[0]->{'int'} or next;
					my @froms = map {$exon_refe->[$_]->{from}} 0..$#$exon_refe;
					my @tos = map {$exon_refe->[$_]->{to}} 0..$#$exon_refe;
					# Columns: symbol, accession, chromosome, strand, first from, last to,
					# first from, last to (repeated), interval count, "from," list, "to," list.
					# NOTE(review): layout presumably mirrors the genes.txt table -- confirm.
					print {$gene_fh} join("\t", $sym, $product->{accession}, $chromosome, $strand, $froms[0], $tos[-1], $froms[0], $tos[-1], scalar @froms, join (",", @froms, ""), join(",", @tos, "")),"\n";
				}
			}
		}
	}

	# Collect GO terms per category from the 'GeneOntology' property.
	my %gos;
	if ($seq->{properties})
	{
		for my $property (@{$seq->{properties}}) # Goes thru each property and looks for 'GeneOntology'
		{
			$property->{heading} eq 'GeneOntology' or next;
			for my $c (@{$property->{comment}}) # GO Category is stored as comment->label and terms stored as array in comment
			{
				my $category = $c->{label};
				for my $c1 (@{$c->{comment}})
				{
					my $term = $c1->{source}->[0]->{anchor};
					$term =~ s/,/;/g; # commas separate terms in the output, so replace them inside a term
					push @{$gos{$category}}, $term;
				}
			}
		}
	}
	my @goterms = map {ref $gos{$_} eq "ARRAY" ? join(", ", @{$gos{$_}}) : ""} @ontos;

	my $syn_ref = $gene->{syn}; # ref to synonym array
	my @syns = ref $syn_ref eq "ARRAY" ? @$syn_ref : (); # if there are synonyms then add them to @syns
	my $official_sym = $sym || shift(@syns) or next; # records without any usable symbol are dropped

	# The same row is stored under the symbol and under "LOC<geneid>".
	$annos{$official_sym} = $annos{"LOC".$geneid} = [$name, $desc, @goterms, $geneid, join (", ", @syns), \%db_ids];
}

# Buffered write errors (e.g. a full disk) only surface at close, so check it.
close $gene_fh or die "Cannot close entrez_genes.txt: $!\n";
# Print the annotation table on STDOUT: a header line made of the fixed
# columns in $hdrs followed by one column per external database name, then
# one row per key of %annos in sorted order.
my @dbid_hdrs = sort keys %all_dbids;
#warn "dbids reported are @dbid_hdrs\n";
print join("\t", $hdrs, @dbid_hdrs) . "\n";
foreach my $symbol (sort keys %annos)
{
	my @fields = @{ $annos{$symbol} };
	my $xrefs = pop @fields; # last element of each row is the per-gene db-id hash ref
	# One value per header column; ids the gene does not have stay empty.
	my @xref_cols = ref $xrefs eq "HASH" ? @{$xrefs}{@dbid_hdrs} : ();
	print join("\t", $symbol, @fields, @xref_cols) . "\n";
}
